# setup and load packages
knitr::opts_chunk$set(
echo = TRUE, message = FALSE, warning = FALSE, fig.path = "Convergence_plots/", dev = "svg"
)
library(ggplot2) # for plotting
suppressPackageStartupMessages(
library(cowplot) # pretty ggplot
)
library(reshape2) # rearranging/'melt'-ing tables
library(plyr) # summarising (pivot-table-style) tables
Heavy chains - Same V gene, Same J gene, Same CDR3 length, 85% CDR3 identity
CDR3s with length > 30 AAs & unproductive sequences are ignored here for simplicity. Considered here the following:
convg_data <- test_data[which(test_data$Num_AAs <= 30 &
test_data$V.DOMAIN.Functionality == "productive"), ]
convg_data <- convg_data[which(convg_data$SampleType2 %in% c("Healthy",
"COVID-19",
"COVID-19Recovered")), ]
# first take list of IgH that binds Spike - definite ones from PDB:
pdb_ab <- read.table('known_binders/20210519ver_sabdab_SARSCoV2Abs_sequences_VJgenes.tsv',
sep = "\t", header = TRUE, stringsAsFactors = FALSE)
# take only CDR3, ie removing the canonical C and W that flanks CDR3
pdb_ab$CDRH3 <- substr(pdb_ab$CDRH3, 2, nchar(pdb_ab$CDRH3)-1)
pdb_ab$struct_id <- apply(pdb_ab[, 1:2], MARGIN = 1, paste, collapse = "|")
# pdb_ab$Specificity <- "PDB"
pdb_ab <- ddply(pdb_ab, c("CDRH3", "Specificity", "Vgene", "Jfamily"), summarise,
fasta_header = paste(struct_id, collapse = ","),
molecule_name = paste(unique(molecule_name), collapse = ","))
pdb_ab$source <- "PDB"
pdb_ab$molecule_name <- apply(pdb_ab[, c("fasta_header", "molecule_name")],
MARGIN = 1, function(x) paste(x[1], x[2], sep = ": "))
pdb_ab <- pdb_ab[, -which(colnames(pdb_ab) == "fasta_header")]
# also read in known binders determined from experiments, obtained from publications
cloned_ab_pub <- c("known_binders/Dugan_et_al.txt",
"known_binders/Brower_et_al.txt",
"known_binders/Kreer_et_al.txt",
"known_binders/Robbiani_et_al.txt",
"known_binders/Gaebler_et_al.txt",
"known_binders/Zost_et_al.txt")
cloned_ab_pub <- lapply(cloned_ab_pub, read.table, stringsAsFactors=FALSE,
sep = "\t", header = TRUE, comment.char = "#")
cloned_ab_pub[[1]]$CDRH3 <- substr(cloned_ab_pub[[1]]$CDRH3, 2,
nchar(cloned_ab_pub[[1]]$CDRH3)-1)
cloned_ab_pub[[2]]$CDRH3 <- substr(cloned_ab_pub[[2]]$CDRH3, 2,
nchar(cloned_ab_pub[[2]]$CDRH3)-1)
cloned_ab_pub[[3]]$CDRH3 <- substr(cloned_ab_pub[[3]]$CDRH3, 2,
nchar(cloned_ab_pub[[3]]$CDRH3)-1)
cloned_ab_pub[[4]]$Specificity <- "Spike/RBD"
cloned_ab_pub[[5]]$Specificity <- "Spike/RBD"
cloned_ab_pub[[1]]$source <- "Dugan_et_al"
cloned_ab_pub[[2]]$source <- "Brouwer_et_al"
cloned_ab_pub[[3]]$source <- "Kreer_et_al"
cloned_ab_pub[[4]]$source <- "Robbiani_et_al"
cloned_ab_pub[[5]]$source <- "Gaebler_et_al"
cloned_ab_pub[[6]]$source <- "Zost_et_al"
cloned_ab_pub <- do.call("rbind", cloned_ab_pub)
cloned_ab_pub <- ddply(cloned_ab_pub, c("CDRH3", "Specificity", "Vgene", "Jfamily"),
summarise,
molecule_name = paste(sort(unique(Antibody)), collapse = ","),
source = paste(sort(unique(source)), collapse = ","))
cloned_ab <- rbind(pdb_ab, cloned_ab_pub)
cloned_ab <- ddply(cloned_ab, c("CDRH3", "Specificity", "Vgene", "Jfamily"), summarise,
molecule_name = paste(sort(unique(molecule_name)), collapse = ","),
source = paste(sort(unique(source)), collapse = ","))
cloned_ab$Num_AAs <- sapply(cloned_ab$CDRH3, nchar)
# group sequences by same V and J genes
convg_data <- convg_data[, c("AA_CDR3_edited", "SampleType2", "Vgene", "Jfamily",
"Seq_ID", "SampleType2", "Num_AAs")]
colnames(convg_data) <- colnames(cloned_ab)
convg_data <- rbind(convg_data, cloned_ab)
convg_data <- split(convg_data, f = list(convg_data$Vgene, convg_data$Jfamily,
convg_data$Num_AAs))
convg_data <- convg_data[sapply(convg_data, nrow) > 1]
# for each V-J combination, calculate % identity, take >= 85%
converge <- lapply(names(convg_data), function(x){
tb <- convg_data[[x]]
print(x)
sims <- stringdist::stringsimmatrix(tb$CDRH3)
sims[lower.tri(sims)] <- NA
sims <- reshape2::melt(sims)
sims <- sims[which(!is.na(sims[, 3])), ]
sims <- sims[which(sims[, 1] != sims[, 2]), ]
sims <- sims[which(sims[, 3] >= 0.85), ]
sims[, 1] <- factor(sims[, 1], levels = 1:nrow(tb), labels = tb$molecule_name)
sims[, 2] <- factor(sims[, 2], levels = 1:nrow(tb), labels = tb$molecule_name)
sims
})
converge <- converge[sapply(converge, function(x) nrow(x) > 0)]
converge <- do.call("rbind", converge)
# map back metadata about the sequences
convg_data <- do.call("rbind", convg_data)
converge <- merge(converge, convg_data[, c("molecule_name", "CDRH3", "Specificity",
"Vgene","Jfamily")],
by.x = "Var1", by.y = "molecule_name", all.x = TRUE, all.y = FALSE,
sort = FALSE)
colnames(converge)[4:7] <- paste0("Seq1_", colnames(converge)[4:7])
converge <- merge(converge, convg_data[, c("molecule_name", "CDRH3", "Specificity",
"Vgene","Jfamily")],
by.x = "Var2", by.y = "molecule_name", all.x = TRUE, all.y = FALSE,
sort = FALSE)
colnames(converge)[8:11] <- paste0("Seq2_", colnames(converge)[8:11])
colnames(converge)[1:3] <- c("Seq2", "Seq1", "identity")
# retain only the sequences coming from different patients
converge$Seq1_PatientID <- sapply(as.character(converge$Seq1),
function(x) gsub("D[0-9]*$", "",
unlist(strsplit(x, split = "_"))[1],
perl = TRUE))
converge$Seq2_PatientID <- sapply(as.character(converge$Seq2),
function(x) gsub("D[0-9]*$", "",
unlist(strsplit(x, split = "_"))[1],
perl = TRUE))
converge <- converge[which(converge$Seq1_PatientID != converge$Seq2_PatientID), ]
saveRDS(converge, "ConvergeSequences_SpikeAbPDB_graph_SameVJPatient.rds")
Added to this set of sequences known binders of SARS-CoV-2 proteins from the following sources:
If V & J germline gene assignment are missing from data source they are annotated using either IMGT/High-VQuest (if nucleotide sequences are available) or IMGT/DomainGapAlign (if only amino acid sequences are available). Total: 822 unique CDRH3 sequences from PDB and publications.
Evaluate CDRH3 AA identity, and construct a network connecting sequences (from data / known binders). Edges are drawn between pairs of sequences using identical rules as in connecting the convergent repertoire sequences.
converge <- readRDS("ConvergeSequences_SpikeAbPDB_graph_SameVJPatient.rds")
convg_data <- do.call("rbind", convg_data)
# process in graph object
converge_graph <- igraph::graph_from_data_frame(converge, directed = FALSE)
# the graph is very disconnected
components <- igraph::components(converge_graph)
# size distribution of components
ggplot(data.frame(components$csize)) + geom_histogram(aes(components.csize)) +
scale_x_log10(name = "Cluster size") + cowplot::theme_cowplot() +
ggtitle("Size distribution of clusters") + ylab("Number of clusters")
# components with at least 10 sequences - makeup breakdown by COVID/Healthy/YFV/PDB
membership <- data.frame(components$membership)
membership$Seq_ID <- rownames(membership)
colnames(membership)[1] <- "cluster"
membership <- list(
membership,
data.frame(cluster = 1:length(components$csize), cluster_size = components$csize)
)
membership <- merge(membership[[1]], membership[[2]], by = "cluster",
all.x = TRUE, all.y = FALSE, sort = FALSE)
converge_nodes <- list(
converge[, which(grepl("^Seq1", colnames(converge)))],
converge[, which(grepl("^Seq2", colnames(converge)))]
)
converge_nodes <- unique(do.call("rbind", lapply(converge_nodes, function(tb){
colnames(tb) <- c("Seq_ID", "CDRH3", "Specificity", "Vgene", "Jfamily", "PatientID")
tb
})))
membership <- merge(membership, converge_nodes,
all.x = TRUE, all.y = FALSE, sort= FALSE)
membership$SampleType <- factor(membership$Specificity,
levels = c("COVID-19", "Healthy", "COVID-19Recovered",
"PDB", "Spike", "Spike/RBD"),
labels = c("Repertoire", "Repertoire", "Repertoire",
"Binders", "Binders", "Binders"))
# those clusters with both repertoire sequences and known binders
membership_both <- ddply(membership, "cluster", summarise,
both = ("Repertoire" %in% SampleType & "Binders" %in% SampleType))
membership_both <- membership_both[which(membership_both$both), "cluster"]
membership$both <- (membership$cluster %in% membership_both)
membership <- merge(membership, test_data[, c("Seq_ID", "NumInClone", "CloneGroup", "Age")],
by = "Seq_ID", all.x = TRUE, all.y = FALSE, sort = FALSE)
membership$SampleType2 <- membership$Specificity
saveRDS(membership, 'convergent_cluster_membership.rds')
# largest clusters & V/J gene use
cluster_summary <- membership[, c("cluster", "cluster_size", "SampleType2", "Age", "PatientID", "Seq_ID",
"Vgene", "Jfamily", "both")]
cluster_summary$AgeGroup <- sapply(cluster_summary$Age, function(x){
if(x %in% c("Young", "Old")){
if(x == "Young") return("leq50")
if(x == "Old") return("geq60")
} else if(is.na(x)) return(NA) else {
x <- as.numeric(x)
if(x <= 50) return("leq50")
if(x >= 60) return("geq60")
return(NA)
}
return(NA)
})
cluster_summary <- cluster_summary[order(cluster_summary$cluster_size, decreasing = TRUE), ]
cluster_large <- cluster_summary[which(cluster_summary$cluster_size >= 10), ]
cluster_large$SampleType2 <- replace(cluster_large$SampleType2, which(cluster_large$SampleType2 == "PDB"),
"Spike/RBD")
cluster_large <- ddply(cluster_large, c("cluster", "cluster_size", "Vgene", "Jfamily", "AgeGroup", "SampleType2"),
summarise, V1 = length(Seq_ID), n_patient = length(unique(PatientID)))
cluster_large$cluster <- factor(cluster_large$cluster,
levels = unique(cluster_large$cluster[order(cluster_large$cluster_size,
decreasing = TRUE)]))
# n sequences per cluster
g1 <- ggplot(cluster_large, aes(x = cluster, y = V1, fill = SampleType2)) + geom_bar(stat = "identity") +
cowplot::theme_cowplot() + theme(axis.text.x = element_blank()) +
scale_fill_manual(values = c("Healthy" = "#7FC97F", "COVID-19" = "#BEAED4", "Spike" = "#99CCFF",
"COVID-19Recovered" = "#FDC086", "Spike/RBD" = "#0000FF"), name = "",
labels = c("CV19", "CV19-Recovered", "Healthy", "Spike", "Spike/RBD")) +
xlab("Convergent clusters") +ylab("Number of\nsequences") + ggtitle("Clusters with 10 or more sequences (n = 43)")
# Vgene usage
g2 <- ggplot(cluster_large, aes(x = cluster, y = Vgene)) + geom_tile(fill = "black") +
cowplot::theme_cowplot() + theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank()) + xlab("") +ylab("V usage")
# Jfamily usage
g3 <- ggplot(cluster_large, aes(x = cluster, y = Jfamily)) + geom_tile(fill = "black") +
cowplot::theme_cowplot() + theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank()) + xlab("") +ylab("J usage")
# number of donors represented (ie with sequences present) in each cluster
g4 <- ggplot(cluster_large[which(!is.na(cluster_large$AgeGroup)), ],
aes(x = cluster, y = n_patient, fill = AgeGroup)) + geom_bar(stat = "identity") +
cowplot::theme_cowplot() + theme(axis.text.x = element_blank()) +
scale_fill_manual(values = c("leq50" = "grey30", "geq60" = "grey70"),
label = c(expression("Age ">=60), expression("Age "<=50)), name = "") +
xlab("Convergent clusters") +ylab("Number of\ndonors")
cowplot::plot_grid(g1, g2, g3, g4, ncol = 1, axis = "lr", align = "v", rel_heights = c(2, 3, 1, 1))
Clusters of sequences with both repertoire sequences and known binders - what are the targets?
both_targets <- unique(membership[membership$cluster %in% membership_both,
c("Seq_ID", "cluster", "cluster_size", "SampleType2", "Specificity")])
both_targets$Specificity <- replace(both_targets$Specificity,
which(grepl("7cws|7l2e|7m8j|7lcn", both_targets$Seq_ID)),
"Spike/NTD")
both_targets$Specificity <- replace(both_targets$Specificity,
which(both_targets$Specificity == "PDB"),
"Spike/RBD")
both_targets <- ddply(both_targets, c("cluster", "cluster_size", "SampleType2", "Specificity"), nrow)
both_targets <- both_targets[order(both_targets$cluster_size, decreasing = TRUE), ]
both_targets$cluster <- factor(both_targets$cluster, levels = unique(both_targets$cluster))
# n sequences per cluster
g4 <- ggplot(unique(both_targets[, c("cluster", "Specificity", "V1")]),
aes(x = cluster, y = V1, fill = Specificity)) + geom_bar(stat = "identity") +
cowplot::theme_cowplot() + theme(axis.text.x = element_blank()) +
scale_fill_manual(values = c("Healthy" = "#7FC97F", "COVID-19" = "#BEAED4",
"COVID-19Recovered" = "#FDC086", "Spike/NTD" = "#bd40ac",
"Spike" = "#99CCFF", "Spike/RBD" = "#0000FF"),name = "",
labels = c("CV19", "CV19-Recovered", "Healthy", "Spike", "Spike/NTD", "Spike/RBD")) +
xlab("") +ylab("Number of\nsequences") + ggtitle("Clusters with binders (n = 28)")
# targets
g5 <- ggplot(both_targets[which(grepl("Spike", both_targets$Specificity)), ],
aes(x = cluster, y = Specificity)) + geom_tile(fill = "black") +
cowplot::theme_cowplot() + theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank()) + xlab("Convergent clusters") +ylab("")
cowplot::plot_grid(g4, g5, ncol = 1, axis = "lr", align = "v", rel_heights = c(2,1))
library(visNetwork)
visualiseNetwork <- function(graph, nodes, node_annotation, cluster, node_id_col = "Seq_ID",
node_color_col = "Specificity",
node_colors = c("Healthy" = "#7FC97F", "COVID-19" = "#BEAED4",
"COVID-19Recovered" = "#FDC086", "PDB" = "#000000",
"Spike" = "#99CCFF", "Spike/RBD" = "#0000FF"))
{
graphData <- visNetwork::toVisNetworkData(igraph::induced_subgraph(graph, nodes))
graphData$edges$width <- (15 - 1) * ( graphData$edges$identity - 0.85 ) / (1 - 0.85) + 1
graphData$edges$color <- "grey"
graphData$nodes <- merge(graphData$nodes, node_annotation, by.x = "id", by.y = node_id_col,
all.x = TRUE, all.y = FALSE, sort = FALSE)
graphData$nodes$label <- NA # no labels
graphData$nodes$color <- graphData$nodes$Specificity
for( i in names(node_colors) ){
graphData$nodes$color <- replace(graphData$nodes$color,
which(graphData$nodes[, node_color_col] == i),
node_colors[i])
}
lnodes <- data.frame(node_colors)
colnames(lnodes)[1] <- "color"
lnodes$label <- rownames(lnodes)
lnodes$color.border <- "white"
lnodes$title <- ""
lnodes$id <- 1:nrow(lnodes)
visNetwork(nodes = graphData$nodes, edges = graphData$edges, main = paste0("Cluster ", cluster)) %>%
visLegend(addNodes = lnodes, useGroups = FALSE) %>% visIgraphLayout()
}
visualiseNetwork(converge_graph, membership[membership$cluster == 591, 1], cluster = 591,
node_annotation = converge_nodes)
visualiseNetwork(converge_graph, membership[membership$cluster == 676, 1], cluster = 676,
node_annotation = converge_nodes)
visualiseNetwork(converge_graph, membership[membership$cluster == 1239, 1], cluster = 1239,
node_annotation = converge_nodes)
visualiseNetwork(converge_graph, membership[membership$cluster == 1479, 1], cluster = 1479,
node_annotation = converge_nodes)
visualiseNetwork(converge_graph, membership[membership$cluster == 1888, 1], cluster = 1888,
node_annotation = converge_nodes)
CDRH3 AA Sequence logos of these clusters:
seqs <- membership[membership$cluster %in% c(591, 676, 1239, 1479, 1888), c("Seq_ID", "cluster", "CDRH3")]
seqs <- merge(seqs, ddply(seqs, "cluster", nrow), by = "cluster")
seqs$cluster <- paste0("Cluster ", seqs$cluster, "\n(n = ", seqs$V1, ")")
library(ggseqlogo)
cowplot::plot_grid(
# cluster 591
ggseqlogo(seqs[which(grepl("Cluster 591", seqs$cluster)), "CDRH3"]) +
ylab(unique(seqs[which(grepl("Cluster 591", seqs$cluster)), "cluster"])) +
theme_cowplot() + theme(legend.position = "none", axis.text = element_blank(), axis.ticks = element_blank(),
axis.line = element_blank(), axis.title.y = element_text(angle = 0, vjust = 0.5)),
# cluster 676
ggseqlogo(seqs[which(grepl("Cluster 676", seqs$cluster)), "CDRH3"]) +
ylab(unique(seqs[which(grepl("Cluster 676", seqs$cluster)), "cluster"])) +
theme_cowplot() + theme(legend.position = "none", axis.text = element_blank(), axis.ticks = element_blank(),
axis.line = element_blank(), axis.title.y = element_text(angle = 0, vjust = 0.5)),
# cluster 1239
ggseqlogo(seqs[which(grepl("Cluster 1239", seqs$cluster)), "CDRH3"]) +
ylab(unique(seqs[which(grepl("Cluster 1239", seqs$cluster)), "cluster"])) +
theme_cowplot() + theme(legend.position = "none", axis.text = element_blank(), axis.ticks = element_blank(),
axis.line = element_blank(), axis.title.y = element_text(angle = 0, vjust = 0.5)),
# cluster 1479
ggseqlogo(seqs[which(grepl("Cluster 1479", seqs$cluster)), "CDRH3"]) +
ylab(unique(seqs[which(grepl("Cluster 1479", seqs$cluster)), "cluster"])) +
theme_cowplot() + theme(legend.position = "none", axis.text = element_blank(), axis.ticks = element_blank(),
axis.line = element_blank(), axis.title.y = element_text(angle = 0, vjust = 0.5)),
# cluster 1888
ggseqlogo(seqs[which(grepl("Cluster 1888", seqs$cluster)), "CDRH3"]) +
ylab(unique(seqs[which(grepl("Cluster 1888", seqs$cluster)), "cluster"])) +
theme_cowplot() + theme(legend.position = "none", axis.text = element_blank(), axis.ticks = element_blank(),
axis.line = element_blank(), axis.title.y = element_text(angle = 0, vjust = 0.5)),
align = "v", axis = "lr", ncol = 1
)
membership$large <- (membership$cluster_size >= 10)
membership$SampleType2 <- replace(membership$Specificity,
which(membership$Specificity == "COVID-19Recovered"),
"COVID-19")
test_data$SampleType3 <- replace(test_data$SampleType2,
which(test_data$SampleType2 == "COVID-19Recovered"),
"COVID-19")
expansion <- ddply(membership, c("SampleType2", "large", "CloneGroup"), nrow)
expansion <- expansion[which(expansion$SampleType2 %in% c("COVID-19", "Healthy")), ]
expansion <- merge(
expansion, ddply(membership, c("SampleType2", "large"), nrow), all.x = TRUE, all.y = FALSE,
by = c("SampleType2", "large"), sort = FALSE
)
expansion$perc <- expansion[, 4] / expansion[, 5]
colnames(expansion) <- c("SampleType2", "large", "CloneGroup", "n", "total", "perc")
expansion$type <- "convergent"
expansion <- list(
expansion,
merge(
ddply(test_data[!test_data$Seq_ID %in% membership$Seq_ID, ],
c("SampleType3", "CloneGroup"), nrow),
ddply(test_data[!test_data$Seq_ID %in% membership$Seq_ID, ],
"SampleType2", nrow), all.x = TRUE, all.y = FALSE,
by.x = "SampleType3", by.y = "SampleType2", sort = FALSE
)
)
expansion[[2]]$perc <- expansion[[2]][, 3] / expansion[[2]][, 4]
colnames(expansion[[2]]) <- c("SampleType2", "CloneGroup", "n", "total", "perc")
expansion[[2]] <- expansion[[2]][which(expansion[[2]]$SampleType2 %in% c("COVID-19", "Healthy")), ]
expansion[[2]]$type <- "other sequences"
expansion[[2]]$large <- "others"
expansion[[2]] <- expansion[[2]][, c("SampleType2", "large", "CloneGroup", "n", "total", "perc", "type")]
expansion <- do.call("rbind", expansion)
expansion$CloneGroup <- factor(expansion$CloneGroup,
levels = c("Unique", "2", "3", "4&5", "6to9", ">10"),
labels = c("Unique", "2", "3", "4-5", "6-9",
"10 or more"))
expansion$large <- factor(expansion$large, levels = c(TRUE, FALSE, "others"),
labels = c("convergent\n(10 or more\nsequences)", "convergent\n(<10 sequences)",
"other\nsequences"))
ggplot(expansion, aes(y = CloneGroup, x = perc)) +
geom_bar(stat = "identity", position = position_dodge(), width = 0.5) +
scale_x_continuous(labels = scales::percent, name = "% sequences") +
ylab("Number of sequences in clone") + cowplot::theme_cowplot() +
facet_grid(large ~ SampleType2) + theme(strip.text = element_text(size = 9))
Another way of presenting this, treating NumInClone as a continuous variable:
expansion_cont <- membership[membership$SampleType2 %in% c("COVID-19", "Healthy"), c("Seq_ID", "large", "NumInClone", "SampleType2")]
expansion_cont <- list(
expansion_cont,
test_data[which(!test_data$Seq_ID %in% expansion_cont$Seq_ID &
test_data$SampleType2 %in% c("COVID-19", "Healthy")),
c("Seq_ID", "SampleType", "NumInClone", "SampleType2")]
)
expansion_cont[[2]][, 2] <- "others"
colnames(expansion_cont[[2]]) <- c("Seq_ID", "large", "NumInClone", "SampleType2")
expansion_cont <- do.call("rbind", expansion_cont)
expansion_cont$large <- factor(expansion_cont$large, levels = c(TRUE, FALSE, "others"),
labels = c("convergent\n(10 or moresequences)", "convergent\n(<10 sequences)",
"other sequences"))
ggplot(expansion_cont, aes(x = large, y = NumInClone)) + geom_boxplot(outlier.shape = NA) +
facet_grid(~ SampleType2, drop = TRUE) +
scale_y_log10(limits = c(1, 500), name = "Number of sequences\nin clone") + cowplot::theme_cowplot() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) + xlab("")
# cant make the ggpubr::compare_means add brackets & p-value * properly so here are the data
ggpubr::compare_means(NumInClone ~ large, group.by = "SampleType2",
data = expansion_cont, method = "wilcox", ref.group = "other sequences")
| SampleType2 | .y. | group1 | group2 | p | p.adj | p.format | p.signif | method |
|---|---|---|---|---|---|---|---|---|
| Healthy | NumInClone | other sequences | convergent | |||||
| (10 or moresequences) | 0 | 0 | <2e-16 | **** | Wilcoxon | |||
| Healthy | NumInClone | other sequences | convergent | |||||
| (<10 sequences) | 0 | 0 | <2e-16 | **** | Wilcoxon | |||
| COVID-19 | NumInClone | other sequences | convergent | |||||
| (10 or moresequences) | 0 | 0 | <2e-16 | **** | Wilcoxon | |||
| COVID-19 | NumInClone | other sequences | convergent | |||||
| (<10 sequences) | 0 | 0 | <2e-16 | **** | Wilcoxon |
V/J gene usage of large convergent clusters shown above.
Here show relationship between convergent cluster size and the number of patients sharing CDRH3 sequences in the convergent cluster. As expected the larger a cluster the more likely we see more patients sharing sequences in this cluster. Map V/J gene usage onto this plot:
gene_usage <- ddply(membership, c("Jfamily", "Vgene", "cluster", "cluster_size"),
summarise,
n_patient = length(unique(PatientID[SampleType2 == "COVID-19"])),
median_clonesize = median(NumInClone, na.rm = TRUE),
lowerci_clonesize = quantile(NumInClone, probs = 0.025,
na.rm = TRUE),
upperci_clonesize = quantile(NumInClone, probs = 0.975,
na.rm = TRUE))
clustersize <- ddply(gene_usage, c("Vgene", "Jfamily"), summarise,
median_clustersize = mean(cluster_size))
clustersize$VJ <- interaction(clustersize$Vgene, clustersize$Jfamily)
clustersize$VJ <- as.character(clustersize$VJ)
gene_usage$VJ <- interaction(gene_usage$Vgene, gene_usage$Jfamily)
gene_usage$VJ <- factor(gene_usage$VJ,
levels = as.character(clustersize[order(clustersize$median_clustersize,
decreasing = TRUE), "VJ"])
)
gene_usage$Vgene_colour <- replace(gene_usage$Vgene,
which(!gene_usage$Vgene %in% c("IGHV1-69",
"IGHV3-7", "IGHV3-23", "IGHV3-30", "IGHV3-30-3",
"IGHV3-33", "IGHV3-53","IGHV4-34","IGHV4-39")),
"others")
vgene_colours <- c(scales::hue_pal()(10)[c(1,8,2,9,7,6,4,3,10)])
names(vgene_colours) <- c("IGHV1-69", "IGHV3-7", "IGHV3-23", "IGHV3-30", "IGHV3-30-3",
"IGHV3-33", "IGHV3-53","IGHV4-34","IGHV4-39")
vgene_colours <- c(vgene_colours, "others" = "grey")
ggplot(gene_usage, aes(x = cluster_size, y = n_patient, colour = Vgene_colour)) +
geom_point(size = 2) + facet_wrap(~ Jfamily) + cowplot::theme_cowplot() +
scale_colour_manual(values = vgene_colours, name = "") +
xlab("Number of sequences in convergent cluster") +
ylab("Number of patients sharing clones")